In [1]:
from indicnlp.morph import unsupervised_morph
In [2]:
# Build indicnlp's unsupervised morphological analyzer for language code "bn"
# (the sample text analysed in this notebook is Bengali).
morph = unsupervised_morph.UnsupervisedMorphAnalyzer("bn")
In [25]:
# Sample input: space-separated Bengali word forms on a single logical line
# (no newlines in the resulting string). Adjacent literals are concatenated
# by the parser; every group but the last ends with the separating space.
text = (
    u"করা করেছিলাম করেছি করতে করেছিল "
    u"হয়েছে হয়েছিল হয় হওয়ার হবে "
    u"আবিষ্কৃত আবিষ্কার অভিষিক্ত অভিষেক অভিষেকের "
    u"আমি আমার আমাদের তুমি তোমার তোমাদের "
    u"বসা বসেছিল বসে বসি বসেছিলাম বস বসার"
)
In [26]:
# Tokenize on single space characters; the sample text separates words with
# exactly one space, so no empty tokens are produced.
word_token = text.split(" ")
In [5]:
# Run the morphological analyzer on every token. word_morph[k] holds the
# analyzer's result for word_token[k], so the two lists stay aligned.
word_morph = [morph.morph_analyze(word) for word in word_token]
In [6]:
import pandas as pd
In [7]:
# Put each word next to its analyzer output in a two-column table.
# NOTE(review): the "1_"/"2_" prefixes presumably force the column order — confirm.
indic = pd.DataFrame({"1_Word": word_token, "2_Morpheme": word_morph})
In [8]:
# Bare expression: the notebook renders the table as this cell's output.
indic
Out[8]:
In [9]:
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator
In [10]:
# Latin-script input string that is fed to each transliterator in this notebook.
bangla_text = "ami apni tumi tomar tomader amar apnar apnader akash"
In [11]:
# Transliterator 1: indicnlp's ItransTransliterator, converting from ITRANS
# input to the script of language code "bn".
text_trans = ItransTransliterator.from_itrans(bangla_text, "bn")
In [12]:
# Show the transliterated text. A parenthesised single-argument print is
# valid on both Python 2 and Python 3; the former
# repr(...).decode("unicode_escape") round-trip was Python 2-only and wrapped
# the output in a repr (u'...') instead of printing the text itself.
print(text_trans)
In [13]:
from transliteration import getInstance
In [14]:
# Transliterator 2: obtain the transliterator object exposed by the
# `transliteration` package's getInstance() factory.
trans = getInstance()
In [15]:
# Transliterate the same input, targeting "bn_IN".
# Note: this rebinds text_trans, replacing the ITRANS result computed earlier.
text_trans = trans.transliterate(bangla_text, "bn_IN")
In [16]:
# Show the transliterated text. A parenthesised single-argument print is
# valid on both Python 2 and Python 3; the former
# repr(...).decode("unicode_escape") round-trip was Python 2-only and would
# garble the output if the library returns a UTF-8 byte string.
print(text_trans)
In [17]:
import rbs
In [18]:
# Stemmer 1 (rbs): stem every token; word_stem1[k] corresponds to word_token[k].
# NOTE(review): the second argument (True) is forwarded unchanged — its
# meaning is defined by rbs.stemWord; confirm against that module.
word_stem1 = [rbs.stemWord(word, True) for word in word_token]
In [19]:
# Put each word next to the stem produced by rbs in a two-column table.
bs1 = pd.DataFrame({"1_Word": word_token, "2_Stem": word_stem1})
In [20]:
# Bare expression: the notebook renders the rbs stem table as this cell's output.
bs1
Out[20]:
In [21]:
# Configure the JVM classpath for pyjnius. This has to run before `jnius`
# itself is imported, because the JVM is started at that import.
# NOTE(review): "path to class" looks like a placeholder — replace it with the
# directory or JAR that contains the Java stemmer class before running.
import jnius_config
jnius_config.set_classpath(".", "path to class")
In [22]:
from jnius import autoclass
In [23]:
# Stemmer 2: get a Python proxy for the Java class named "RuleFileParser"
# (it must be resolvable from the configured classpath).
cls = autoclass("RuleFileParser")
In [24]:
# Instantiate the Java class via its no-argument constructor.
stemmer = cls()
In [27]:
# Stemmer 2 (Java RuleFileParser via pyjnius): stem every token;
# word_stem2[k] corresponds to word_token[k].
word_stem2 = [stemmer.stemOfWord(word) for word in word_token]
In [28]:
# Put each word next to the stem produced by the Java stemmer in a two-column table.
bs2 = pd.DataFrame({"1_Word": word_token, "2_Stem": word_stem2})
In [29]:
# Bare expression: the notebook renders the Java-stemmer table as this cell's output.
bs2
Out[29]:
In [30]:
from pyavrophonetic import avro
In [31]:
# Transliterator 3: pass the same Latin-script input through pyavrophonetic's parser.
trans_text = avro.parse(bangla_text)
In [32]:
# Show the Avro result. A parenthesised single-argument print is valid on
# both Python 2 and Python 3; the former repr(...).decode("unicode_escape")
# round-trip was Python 2-only and printed a repr rather than the text itself.
print(trans_text)